

############################
### 01_attach_text.R #######
############################

library(pacman)
p_load(textreadr, dplyr, stringr, tidyr, haven, tokenizers, here, readxl, magrittr)
i_am("Code/01_attach_text.R")

#### Prepare to process text documents ####


# List the transcript documents. `pattern` is a regular expression, so it is
# anchored to the ".docx" extension instead of matching "docx" anywhere in a
# file name (which would also pick up files that read_docx() cannot open).
transcript_files <- list.files(path = here("DOCX Documents"), pattern = "\\.docx$")
if (length(transcript_files) == 0) {
  stop("No .docx transcripts found in ", here("DOCX Documents"), call. = FALSE)
}

# One row per transcript document; `names` keeps the file name including the
# .docx extension so the documents can be loaded from the directory later.
transcriptnames <- data.frame(names = transcript_files, stringsAsFactors = FALSE)

# Build the key used to merge transcripts with the juror-level transcript ID:
# only the digits of the file name are kept.
# File names must contain only the cassette number (such as 4-3), the weekend
# number (such as 2) and the scenario number (such as 12). Any other number,
# such as "(1)" at the end of a document name, has to be removed from the file
# name itself; the steps below do not strip it.
#   1. drop the "corr2"/"Corr2" marker first, so that its "2" does not end up
#      in the key once the letters are removed
#   2. drop letters and hyphens
#   3. drop spaces and dots
transcriptnames$transcriptnumbers <- transcriptnames$names %>%
  str_remove("corr2|Corr2") %>%
  str_remove_all("[qwertyuioplkjhgfdsazxcvbnmQWERTYUIOPLKJHGFDSAZXCVBNM-]") %>%
  str_remove_all(" |\\.")

# Read jurorspeech.csv, which holds the RA-coded juror-level information.
jurorspeech <- read.csv(here("Data", "jurorspeech.csv"))

# Merge key: transcript_id with letters, colons and then spaces stripped.
jurorspeech$transcript_id_numbers <- jurorspeech$transcript_id %>%
  str_remove_all("[qwertyuioplkjhgfdsazxcvbnmQWERTYUIOPLKJHGFDSAZXCVBNM:]") %>%
  str_remove_all(" ")

# Diagnostic: print every transcript key that does not have exactly 6 rows.
jurorspeech %>%
  group_by(transcript_id_numbers) %>%
  summarize(n()) %>%
  filter(`n()` != 6)
  # slash means unable to identify transcript; should be 270

# Normalise the speaker identifier (no spaces, all upper case) so that it can
# be compared with the speaker labels found in the transcripts.
jurorspeech$Identifier <- toupper(str_remove_all(jurorspeech$Identifier, " "))

# Drop the helper columns of transcriptnames that are no longer needed.
transcriptnames$`seq(1:nrow(transcriptnames))` <- NULL
transcriptnames$cleaned <- NULL

# Empty placeholders for the text that the processing loops attach to each
# juror: the whole deliberation, and each of the two rounds separately.
jurorspeech[c("jurortext", "jurortext_rd1", "jurortext_rd2")] <- NA

#### Process text documents ####


# First pass over the transcripts: for every document, export a CSV with one
# row per speaking turn, then attach to each juror in jurorspeech the
# concatenation of everything that juror said in the document.

# write.csv() does not create directories, so make sure the export folder
# exists before the loop starts.
dir.create(here("DOCX Documents", "csv_speakingturns"),
           showWarnings = FALSE, recursive = TRUE)

# seq_len() keeps the loop from running when there are no transcripts.
for (i in seq_len(nrow(transcriptnames))) {
  doc_name <- transcriptnames$names[i]
  doc_id <- transcriptnames$transcriptnumbers[i]

  # read the document; each element is treated as one speaking turn
  turns <- data.frame(
    jury = as.character(read_docx(here("DOCX Documents", doc_name))),
    stringsAsFactors = FALSE
  )

  # split every turn into speaker and words at the first colon;
  # extra = "merge" keeps any later colons inside the text
  turns <- turns %>% separate(jury, c("juror", "text"), ":", extra = "merge")

  # normalise typographic apostrophes and ellipses in the spoken text
  turns$text <- str_replace_all(turns$text, "[’]", "'")
  turns$text <- str_replace_all(turns$text, "[…]", "...")

  # export the speaking turns of this transcript; the dot in the extension is
  # escaped so that only a literal ".docx" is stripped from the file name
  write.csv(
    turns,
    file = here("DOCX Documents", "csv_speakingturns",
                paste0(str_remove(doc_name, "\\.docx"), ".csv"))
  )

  # clean up speaker labels (done after the export, so the CSV keeps the
  # labels as transcribed)
  turns$juror <- toupper(str_remove_all(turns$juror, " "))

  # from the juror dataset, get each identified speaker of this jury
  in_jury <- jurorspeech$transcript_id_numbers == doc_id
  speakers <- unique(jurorspeech$Identifier[which(in_jury)])
  speakers <- speakers[!is.na(speakers)]

  # find the transcript lines that go with each speaker and store them,
  # joined by spaces, on that speaker's rows of the juror data;
  # which() drops NA comparisons so they are never used as row indices
  for (speaker in speakers) {
    target_rows <- which(in_jury & jurorspeech$Identifier == speaker)
    spoken <- turns$text[which(turns$juror == speaker)]
    jurorspeech[target_rows, "jurortext"] <- paste0(spoken, collapse = " ")
  }
}



# Second pass: repeat the attribution separately for each round. A transcript
# is cut at the first line that mentions envelope 3; the text before the cut
# goes to jurortext_rd1 and the text after it to jurortext_rd2.

for (i in seq_len(nrow(transcriptnames))) {
  doc_name <- transcriptnames$names[i]
  doc_id <- transcriptnames$transcriptnumbers[i]

  # grab the document; each element is treated as one speaking turn
  doc_lines <- as.character(read_docx(here("DOCX Documents", doc_name)))

  # NOTE(review): "[Een ]velop" matches a single E, e, n or space followed by
  # "velop"; "[Ee]nvelop" may have been intended - confirm before changing.
  spot <- which(grepl("[Een ]velop([e #]*?)3", doc_lines))[1]

  # Without a marker the transcript cannot be split. Leave both round columns
  # as NA for this jury and carry on, instead of aborting the whole run on
  # 1:NA.
  if (is.na(spot)) {
    warning("No envelope-3 marker found in ", doc_name,
            "; jurortext_rd1 and jurortext_rd2 are left as NA for it.",
            call. = FALSE)
    next
  }

  # the marker line itself belongs to both rounds
  rounds <- list(doc_lines[seq_len(spot)], doc_lines[spot:length(doc_lines)])

  # from the juror dataset, get each identified speaker of this jury
  # (the same for both rounds, so it is looked up once per transcript)
  in_jury <- jurorspeech$transcript_id_numbers == doc_id
  speakers <- unique(jurorspeech$Identifier[which(in_jury)])
  speakers <- speakers[!is.na(speakers)]

  for (k in seq_along(rounds)) {
    # separate into speaker and words at the first colon
    turns <- data.frame(jury = rounds[[k]], stringsAsFactors = FALSE) %>%
      separate(jury, c("juror", "text"), ":", extra = "merge")

    # clean up text
    turns$text <- str_replace_all(turns$text, "[’]", "'")
    turns$text <- str_replace_all(turns$text, "[…]", "...")

    # clean up speaker labels
    turns$juror <- toupper(str_remove_all(turns$juror, " "))

    # find the lines of this round that go with each speaker and store them,
    # joined by spaces, in the column of round k; which() drops NA
    # comparisons so they are never used as row indices
    round_column <- paste0("jurortext_rd", k)
    for (speaker in speakers) {
      target_rows <- which(in_jury & jurorspeech$Identifier == speaker)
      spoken <- turns$text[which(turns$juror == speaker)]
      jurorspeech[target_rows, round_column] <- paste0(spoken, collapse = " ")
    }
  }
}


# Attach the transcript file names to the juror-level speech data, keeping
# unmatched rows from both sides.
jurorspeech2 <- merge(
  x = jurorspeech,
  y = transcriptnames,
  by.x = "transcript_id_numbers",
  by.y = "transcriptnumbers",
  all = TRUE
)

# Read the main juror-level data (JuryDataSummer2004.csv) and join the speech
# data onto it by case_id, again keeping unmatched rows from both sides.
jurormainfile <- here("Data", "JuryDataSummer2004.csv") %>%
  read.csv() %>%
  full_join(jurorspeech2, by = "case_id", multiple = "all")

# Export the combined dataset.
write.csv(jurormainfile, file = here("Data", "jurorspeechtrial.csv"))


#### Produce descriptive statistics dataset ####

# For every transcript, calculate its total length (in words and in speaking
# turns) and how much of it is attributed to an identified speaker, using the
# speaking-turn CSVs exported earlier in this script.

# CSV base names: the transcript file names without the .docx extension.
transcript_stems <- str_remove(transcriptnames$names, "\\.docx")

transcriptnames$turns <- NA
transcriptnames$words <- NA
transcriptnames$turns_attributed <- NA
transcriptnames$words_attributed <- NA

for (i in seq_along(transcript_stems)) {
  turns <- read.csv(here("DOCX Documents",
                         paste0("csv_speakingturns/", transcript_stems[i], ".csv")))

  # normalise speaker labels, then drop rows whose label contains one of
  # these markers (ENVELOP, TAPE, TRANSCRIB, WEEK, SCENARIO)
  turns$juror <- toupper(str_remove_all(turns$juror, " "))
  turns <- turns[!grepl("ENVELOP|TAPE|TRANSCRIB|WEEK|SCENARIO", turns$juror), ]

  # a turn is attributed unless the speaker is only labelled MAN or WOMAN
  attributed <- !(turns$juror %in% c("MAN", "WOMAN"))

  # Words are counted per turn and summed. Turns with missing text (no colon
  # in the transcript line) contribute 0 words; pasting them together first
  # would turn each NA into the literal word "NA" and inflate the counts.
  words_per_turn <- str_count(as.character(turns$text), "\\S+")

  transcriptnames$turns[i] <- nrow(turns)
  transcriptnames$words[i] <- sum(words_per_turn, na.rm = TRUE)
  transcriptnames$turns_attributed[i] <- sum(attributed)
  transcriptnames$words_attributed[i] <- sum(words_per_turn[attributed], na.rm = TRUE)
}

write.csv(transcriptnames, here("Data", "transcript_details.csv"))
